{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> 该章内容几乎等同于数据预处理，鉴于可以用pandas实现相同功能，因此只做了部分笔记"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = tf.range(10)\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = tf.data.Dataset.from_tensor_slices(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(0, shape=(), dtype=int32)\n",
      "tf.Tensor(1, shape=(), dtype=int32)\n",
      "tf.Tensor(2, shape=(), dtype=int32)\n",
      "tf.Tensor(3, shape=(), dtype=int32)\n",
      "tf.Tensor(4, shape=(), dtype=int32)\n",
      "tf.Tensor(5, shape=(), dtype=int32)\n",
      "tf.Tensor(6, shape=(), dtype=int32)\n",
      "tf.Tensor(7, shape=(), dtype=int32)\n",
      "tf.Tensor(8, shape=(), dtype=int32)\n",
      "tf.Tensor(9, shape=(), dtype=int32)\n"
     ]
    }
   ],
   "source": [
    "for i in dataset:\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 链式转换"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = dataset.repeat(3).batch(7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)\n",
      "tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)\n",
      "tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)\n",
      "tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)\n",
      "tf.Tensor([8 9], shape=(2,), dtype=int32)\n"
     ]
    }
   ],
   "source": [
    "for i in dataset:\n",
    "    print(i)\n",
    "# 将数据重复3次，7个元素为一个批次分组，均匀遍历此数据，因此最后一个输出大小为2\n",
    "# 可以使用.batch(7, drop_remainder=True)来删除末端批次"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = dataset.map(lambda x: x * 2) # 可以使用map()方法转换每个元素"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)\n",
      "tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)\n",
      "tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)\n",
      "tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)\n",
      "tf.Tensor([16 18], shape=(2,), dtype=int32)\n"
     ]
    }
   ],
   "source": [
    "for i in dataset:\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = dataset.apply(tf.data.Dataset.unbatch)\n",
    "# apply()方法应用与整个数据集\n",
    "# unbatch()解除批次，使每个元素成为一个单独的整数张量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(0, shape=(), dtype=int32)\n",
      "tf.Tensor(2, shape=(), dtype=int32)\n",
      "tf.Tensor(4, shape=(), dtype=int32)\n",
      "tf.Tensor(6, shape=(), dtype=int32)\n",
      "tf.Tensor(8, shape=(), dtype=int32)\n",
      "tf.Tensor(10, shape=(), dtype=int32)\n",
      "tf.Tensor(12, shape=(), dtype=int32)\n",
      "tf.Tensor(14, shape=(), dtype=int32)\n",
      "tf.Tensor(16, shape=(), dtype=int32)\n",
      "tf.Tensor(18, shape=(), dtype=int32)\n",
      "tf.Tensor(0, shape=(), dtype=int32)\n",
      "tf.Tensor(2, shape=(), dtype=int32)\n",
      "tf.Tensor(4, shape=(), dtype=int32)\n",
      "tf.Tensor(6, shape=(), dtype=int32)\n",
      "tf.Tensor(8, shape=(), dtype=int32)\n",
      "tf.Tensor(10, shape=(), dtype=int32)\n",
      "tf.Tensor(12, shape=(), dtype=int32)\n",
      "tf.Tensor(14, shape=(), dtype=int32)\n",
      "tf.Tensor(16, shape=(), dtype=int32)\n",
      "tf.Tensor(18, shape=(), dtype=int32)\n",
      "tf.Tensor(0, shape=(), dtype=int32)\n",
      "tf.Tensor(2, shape=(), dtype=int32)\n",
      "tf.Tensor(4, shape=(), dtype=int32)\n",
      "tf.Tensor(6, shape=(), dtype=int32)\n",
      "tf.Tensor(8, shape=(), dtype=int32)\n",
      "tf.Tensor(10, shape=(), dtype=int32)\n",
      "tf.Tensor(12, shape=(), dtype=int32)\n",
      "tf.Tensor(14, shape=(), dtype=int32)\n",
      "tf.Tensor(16, shape=(), dtype=int32)\n",
      "tf.Tensor(18, shape=(), dtype=int32)\n"
     ]
    }
   ],
   "source": [
    "for i in dataset:\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = dataset.filter(lambda x: x < 10) # 过滤出x小于10的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(0, shape=(), dtype=int32)\n",
      "tf.Tensor(2, shape=(), dtype=int32)\n",
      "tf.Tensor(4, shape=(), dtype=int32)\n",
      "tf.Tensor(6, shape=(), dtype=int32)\n",
      "tf.Tensor(8, shape=(), dtype=int32)\n",
      "tf.Tensor(0, shape=(), dtype=int32)\n",
      "tf.Tensor(2, shape=(), dtype=int32)\n",
      "tf.Tensor(4, shape=(), dtype=int32)\n",
      "tf.Tensor(6, shape=(), dtype=int32)\n",
      "tf.Tensor(8, shape=(), dtype=int32)\n",
      "tf.Tensor(0, shape=(), dtype=int32)\n",
      "tf.Tensor(2, shape=(), dtype=int32)\n",
      "tf.Tensor(4, shape=(), dtype=int32)\n",
      "tf.Tensor(6, shape=(), dtype=int32)\n",
      "tf.Tensor(8, shape=(), dtype=int32)\n"
     ]
    }
   ],
   "source": [
    "for i in dataset:\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 乱序数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)\n",
      "tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)\n",
      "tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)\n",
      "tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)\n",
      "tf.Tensor([3 6], shape=(2,), dtype=int64)\n"
     ]
    }
   ],
   "source": [
    "dataset = tf.data.Dataset.range(10).repeat(3)\n",
    "dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)\n",
    "for item in dataset:\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "7b4b4feff2f24a0f0a34464dbe537a36fda679851528fb8735cb41fa49dffb2d"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
